import sys
import os

# Make the project checkout importable.
# NOTE: the path is hard-coded for the environment this notebook was run in.
sys.path.append("/home/jovyan/work/sem-covid/")
# De-duplicate sys.path while keeping the first occurrence of every entry in
# its original position. The previous list(set(sys.path)) produced an
# arbitrary order (string hashing is randomised per process), which can
# change which copy of a module gets imported.
sys.path = list(dict.fromkeys(sys.path))
# Run with the project directory as the working directory.
os.chdir('/home/jovyan/work/sem-covid/')
from tqdm import tqdm
import pandas as pd
from collections import Counter
from sem_covid.services.data_registry import Dataset
import spacy
from cleantext import clean
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.express as px
from IPython.display import display,Markdown
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation
# Names of the columns that are passed to eda_textual at the end of the file.
TEXTUAL_COLUMNS = [
    'title',
    'background_info_description',
    'content_of_measure_description',
    'use_of_measure_description',
    'involvement_of_social_partners_description',
]

# Data set fetched through the project's data registry.
df = Dataset.PWDB.fetch()

# spaCy pipeline loaded without the lemmatizer component; the bare tokenizer
# is kept separately for get_words.
nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer"])
tokenizer = nlp.tokenizer
def calculate_frequency(data: pd.Series, title: str, relative=False):
    """Count the ten most frequent values of *data*.

    Args:
        data: Values to count. Missing values are ignored. The Series passed
            in is left untouched.
        title: Name of the output column that holds the counted values.
        relative: When true, the counts are converted to percentages of the
            summed counts of the returned rows (i.e. of the top ten only)
            and rounded to two decimals.

    Returns:
        A DataFrame with the columns ``title`` and ``'Absolute freq'`` or
        ``'Relative freq'``, ordered from most to least frequent.
    """
    observation_type_name = 'Relative freq' if relative else 'Absolute freq'
    # dropna() returns a new Series. The previous inplace=True variant
    # silently removed rows from the caller's object.
    values = data.dropna()
    observation = pd.DataFrame(
        Counter(values).most_common(10),
        columns=[title, observation_type_name],
    )
    if relative:
        counts = observation[observation_type_name]
        observation[observation_type_name] = (counts / (counts.sum() / 100)).round(2)
    return observation
def get_nlp_docs(data: pd.Series):
    """Run the module-level spaCy pipeline ``nlp`` over every text in *data*.

    Args:
        data: Texts to process.

    Returns:
        A list with one processed document per text, in input order.
    """
    # nlp.pipe processes the texts in batches and yields the documents in
    # input order, instead of invoking the pipeline once per text.
    return list(nlp.pipe(data))
def get_named_entities(data: pd.Series, docs: list = None):
    """Collect the label of every entity found in the documents.

    Args:
        data: Texts to analyse; only used when *docs* is not supplied.
        docs: Already processed documents. When ``None`` they are produced
            from *data* with ``get_nlp_docs``.

    Returns:
        A string Series with one entry per entity occurrence, holding the
        entity's ``label_``.
    """
    parsed = get_nlp_docs(data) if docs is None else docs
    labels = []
    for doc in parsed:
        labels.extend(entity.label_ for entity in doc.ents)
    return pd.Series(labels, dtype=str)
def get_entity_words(data: pd.Series, entity_class: str = 'ORG', docs: list = None):
    """Collect the text of every entity that carries a given label.

    Args:
        data: Texts to analyse; only used when *docs* is not supplied.
        entity_class: Entity label to keep (compared against ``label_``).
        docs: Already processed documents. When ``None`` they are produced
            from *data* with ``get_nlp_docs``.

    Returns:
        A string Series with the ``text`` of each matching entity.
    """
    parsed = get_nlp_docs(data) if docs is None else docs
    words = []
    for doc in parsed:
        for entity in doc.ents:
            if entity.label_ == entity_class:
                words.append(entity.text)
    return pd.Series(words, dtype=str)
def calculate_tf_idf(data: pd.Series, title: str):
    """Rank the vocabulary of *data* by its highest TF-IDF weight.

    Stop words are removed from every text with ``remove_stopwords`` before
    the texts are vectorised with a default ``TfidfVectorizer``.

    Args:
        data: Texts, one document per entry.
        title: Name of the output column that holds the terms.

    Returns:
        A DataFrame with the columns ``title`` and ``"TF-IDF"``: one row per
        term with the maximum weight that term reaches in any document,
        sorted from highest to lowest. An empty input yields an empty frame.
    """
    if data.empty:
        # Nothing to vectorise; fit_transform would raise on an empty corpus.
        return pd.DataFrame(columns=[title, "TF-IDF"])

    vectorizer = TfidfVectorizer()
    documents = data.apply(remove_stopwords)
    vectors = vectorizer.fit_transform(documents)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases that predate the replacement.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Take the per-term maximum directly on the sparse matrix rather than
    # materialising a dense documents-by-vocabulary table first.
    best_scores = vectors.max(axis=0).toarray().ravel()

    tmp_df = (
        pd.Series(best_scores, index=feature_names)
        .sort_values(ascending=False)
        .reset_index()
    )
    tmp_df.columns = [title, "TF-IDF"]
    return tmp_df
def get_ngrams(data: pd.Series, n: int, stopwords: bool = True):
    """Build all word n-grams of every text in *data*.

    Args:
        data: Texts; each one is split on whitespace.
        n: Number of consecutive words per n-gram.
        stopwords: When false, ``remove_stopwords`` is applied to every text
            before splitting; when true the texts are used as they are.

    Returns:
        A string Series with the n-grams of all texts, words joined by a
        single space. Texts with fewer than *n* words contribute nothing.
    """
    texts = data if stopwords else data.apply(remove_stopwords)
    grams = []
    for tokens in texts.str.split():
        window_count = len(tokens) - n + 1
        for start in range(window_count):
            grams.append(" ".join(tokens[start:start + n]))
    return pd.Series(grams, dtype=str)
def get_noun_phrases(data: pd.Series, docs: list = None):
    """Collect the noun chunks of the documents as strings.

    Args:
        data: Texts to analyse; only used when *docs* is not supplied.
        docs: Already processed documents. When ``None`` they are produced
            from *data* with ``get_nlp_docs``.

    Returns:
        A string Series with ``str()`` of every item in each document's
        ``noun_chunks``.
    """
    parsed = get_nlp_docs(data) if docs is None else docs
    phrases = []
    for doc in parsed:
        phrases.extend(map(str, doc.noun_chunks))
    return pd.Series(phrases, dtype=str)
def get_words(data: pd.Series):
    """Tokenise all texts of *data* as one stop-word-free string.

    The texts are joined with single spaces, passed through
    ``remove_stopwords`` and then through the module-level ``tokenizer``.

    Args:
        data: Texts to tokenise.

    Returns:
        A string Series built from the tokenizer output.
    """
    joined = " ".join(data)
    without_stopwords = remove_stopwords(joined)
    tokens = tokenizer(without_stopwords)
    return pd.Series(tokens, dtype=str)
def prepare_text_data(data: pd.Series):
    """Flatten and clean a text column.

    List-like entries are expanded to one row per element, missing values
    are dropped, and every remaining text is passed through ``clean`` (with
    the URL, e-mail and phone-number options enabled) and then through
    ``strip_punctuation``.

    Args:
        data: Column of texts; entries may also be lists of texts.

    Returns:
        A string Series of the cleaned texts with a fresh default index.
    """
    # Explode BEFORE dropping missing values: explode() turns empty lists
    # into NaN and exposes None/NaN list members, so dropping first would let
    # those reach clean() as if they were text.
    flattened = data.explode().dropna()
    result = [
        strip_punctuation(
            clean(
                text,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
            )
        )
        for text in flattened
    ]
    return pd.Series(result, dtype=str)
def plot_bar_chart(observations: pd.DataFrame, chart_title: str):
    """Build a bar chart from a two-column observation table.

    Args:
        observations: Table whose second column is plotted on the x axis and
            whose first column is plotted on the y axis.
        chart_title: Title of the chart.

    Returns:
        The figure created by ``px.bar``.
    """
    x_column = observations.columns[1]
    y_column = observations.columns[0]
    return px.bar(observations, x=x_column, y=y_column, title=chart_title)
def plot_pie_chart(observations: pd.DataFrame, chart_title: str):
    """Build a pie chart from a two-column observation table.

    Args:
        observations: Table whose second column supplies the slice values
            and whose first column supplies the slice names.
        chart_title: Title of the chart.

    Returns:
        The figure created by ``px.pie``.
    """
    value_column = observations.columns[1]
    name_column = observations.columns[0]
    return px.pie(
        observations,
        values=value_column,
        names=name_column,
        title=chart_title,
    )
def eda_display_result(result: pd.DataFrame, chart_title: str, bar_char: bool = True, pie_chart: bool = True):
    """Display a heading, the result summary and the requested charts.

    Nothing is displayed when *result* holds no cells.

    Args:
        result: Two-column observation table to present.
        chart_title: Text shown as Markdown heading and used as chart title.
        bar_char: Show the bar chart when true.
        pie_chart: Show the pie chart when true.
    """
    if result.size <= 0:
        return

    display(Markdown(chart_title))
    # NOTE(review): tuple(result) iterates the DataFrame's column labels, so
    # this shows only the two column names, not the table rows. Confirm
    # whether display(result) was intended.
    display(tuple(result))

    chart_builders = ((bar_char, plot_bar_chart), (pie_chart, plot_pie_chart))
    for enabled, build_chart in chart_builders:
        if enabled:
            build_chart(result, chart_title).show()
def eda_entity_words(data: pd.Series, data_title: str, docs):
    """Show the relative frequency of the most common 'ORG' entity texts.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
        docs: Processed documents handed to ``get_entity_words``.
    """
    organisations = get_entity_words(data, 'ORG', docs)
    frequencies = calculate_frequency(organisations, data_title, True)
    heading = "Entity words for " + data_title
    eda_display_result(frequencies, heading)
def eda_words_freq(data: pd.Series, data_title: str):
    """Show the relative frequency of the most common words.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
    """
    words = get_words(data)
    frequencies = calculate_frequency(words, data_title, True)
    heading = "Words frequency for " + data_title
    eda_display_result(frequencies, heading)
def eda_named_entities(data: pd.Series, data_title: str, docs):
    """Show the relative frequency of the most common entity labels.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
        docs: Processed documents handed to ``get_named_entities``.
    """
    labels = get_named_entities(data, docs)
    frequencies = calculate_frequency(labels, data_title, True)
    heading = "Named entities for " + data_title
    eda_display_result(frequencies, heading)
def eda_noun_phrases(data: pd.Series, data_title: str, docs):
    """Show the relative frequency of the most common noun phrases.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
        docs: Processed documents handed to ``get_noun_phrases``.
    """
    phrases = get_noun_phrases(data, docs)
    frequencies = calculate_frequency(phrases, data_title, True)
    heading = "Noun phrases for " + data_title
    eda_display_result(frequencies, heading)
def eda_n_grams(data: pd.Series, data_title: str, n_grams):
    """Show the relative frequency of the most common word n-grams.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
        n_grams: Number of words per n-gram, passed to ``get_ngrams``.
    """
    grams = get_ngrams(data, n_grams)
    frequencies = calculate_frequency(grams, data_title, True)
    heading = "N grams for " + data_title
    eda_display_result(frequencies, heading)
def eda_n_grams_without_stopwords(data: pd.Series, data_title: str, n_grams):
    """Show the relative frequency of the most common n-grams, stop words removed.

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
        n_grams: Number of words per n-gram, passed to ``get_ngrams``.
    """
    # The third argument False makes get_ngrams strip stop words first.
    grams = get_ngrams(data, n_grams, False)
    frequencies = calculate_frequency(grams, data_title, True)
    heading = "N grams without stopwords for " + data_title
    eda_display_result(frequencies, heading)
def eda_tf_idf(data: pd.Series, data_title: str):
    """Show the ten terms with the highest TF-IDF weight (bar chart only).

    Args:
        data: Texts of the column being analysed.
        data_title: Column name, used for the table column and the heading.
    """
    ranking = calculate_tf_idf(data, data_title)
    top_terms = ranking.head(10)
    heading = "TOP 10 TF-IDF for " + data_title
    eda_display_result(top_terms, heading, pie_chart=False)
def eda_textual(data: pd.DataFrame):
    """Run every textual EDA step on each column of *data*.

    A tqdm progress bar tracks the columns. For each column the text is
    prepared once and processed once into documents, which are shared by the
    noun-phrase and entity steps.

    Args:
        data: Frame whose columns are all treated as text columns.
    """
    progress = tqdm(data.columns)
    for column_name in progress:
        progress.set_description('Eda on textual data [' + column_name + ']')

        texts = prepare_text_data(data[column_name])
        parsed_docs = get_nlp_docs(texts)

        # Steps that work on the raw texts.
        eda_words_freq(texts, column_name)
        eda_n_grams(texts, column_name, 3)
        eda_n_grams_without_stopwords(texts, column_name, 3)

        # Steps that reuse the processed documents.
        eda_noun_phrases(texts, column_name, parsed_docs)
        eda_named_entities(texts, column_name, parsed_docs)
        eda_entity_words(texts, column_name, parsed_docs)

        eda_tf_idf(texts, column_name)
# Run the textual EDA on the columns listed in TEXTUAL_COLUMNS.
eda_textual(df[TEXTUAL_COLUMNS])
Eda on textual data [title]: 0%| | 0/5 [00:00<?, ?it/s]
Words frequency for title
('title', 'Relative freq')
N grams for title
('title', 'Relative freq')
N grams without stopwords for title
('title', 'Relative freq')
Noun phrases for title
('title', 'Relative freq')
Named entities for title
('title', 'Relative freq')
Entity words for title
('title', 'Relative freq')
TOP 10 TF-IDF for title
('title', 'TF-IDF')
Eda on textual data [background_info_description]: 20%|██ | 1/5 [00:04<00:18, 4.66s/it]
Words frequency for background_info_description
('background_info_description', 'Relative freq')
N grams for background_info_description
('background_info_description', 'Relative freq')
N grams without stopwords for background_info_description
('background_info_description', 'Relative freq')
Noun phrases for background_info_description
('background_info_description', 'Relative freq')
Named entities for background_info_description
('background_info_description', 'Relative freq')
Entity words for background_info_description
('background_info_description', 'Relative freq')
TOP 10 TF-IDF for background_info_description
('background_info_description', 'TF-IDF')
Eda on textual data [content_of_measure_description]: 40%|████ | 2/5 [00:22<00:37, 12.42s/it]
Words frequency for content_of_measure_description
('content_of_measure_description', 'Relative freq')
N grams for content_of_measure_description
('content_of_measure_description', 'Relative freq')
N grams without stopwords for content_of_measure_description
('content_of_measure_description', 'Relative freq')
Noun phrases for content_of_measure_description
('content_of_measure_description', 'Relative freq')
Named entities for content_of_measure_description
('content_of_measure_description', 'Relative freq')
Entity words for content_of_measure_description
('content_of_measure_description', 'Relative freq')
TOP 10 TF-IDF for content_of_measure_description
('content_of_measure_description', 'TF-IDF')
Eda on textual data [use_of_measure_description]: 60%|██████ | 3/5 [00:51<00:39, 19.96s/it]
Words frequency for use_of_measure_description
('use_of_measure_description', 'Relative freq')
N grams for use_of_measure_description
('use_of_measure_description', 'Relative freq')
N grams without stopwords for use_of_measure_description
('use_of_measure_description', 'Relative freq')
Noun phrases for use_of_measure_description
('use_of_measure_description', 'Relative freq')
Named entities for use_of_measure_description
('use_of_measure_description', 'Relative freq')
TOP 10 TF-IDF for use_of_measure_description
('use_of_measure_description', 'TF-IDF')
Eda on textual data [involvement_of_social_partners_description]: 80%|████████ | 4/5 [01:01<00:16, 16.02s/it]
Words frequency for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'Relative freq')
N grams for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'Relative freq')
N grams without stopwords for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'Relative freq')
Noun phrases for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'Relative freq')
Named entities for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'Relative freq')
TOP 10 TF-IDF for involvement_of_social_partners_description
('involvement_of_social_partners_description', 'TF-IDF')
Eda on textual data [involvement_of_social_partners_description]: 100%|██████████| 5/5 [01:08<00:00, 13.67s/it]